Env setupu¶

In [1]:
import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
import matplotlib.pyplot as plt
import plotly.express as px
from anndata import AnnData
from sklearn.neighbors import KernelDensity
from matplotlib.colors import TwoSlopeNorm

import random
from matplotlib import pylab
import warnings
import scanpy.external as sce 
import yaml

import seaborn as sns
import os
import sys

warnings.filterwarnings('ignore')
In [2]:
import anndata2ri
import rpy2.rinterface_lib.callbacks
import logging
In [3]:
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
In [4]:
anndata2ri.activate()
In [5]:
%load_ext rpy2.ipython
In [6]:
sc.settings.verbosity = 3  
sc.logging.print_header()
nprocs=4
np.random.seed(0)
with open("../data/resources/rcParams.yaml") as f:
    rcParamsDict = yaml.full_load(f)
    for k in rcParamsDict["rcParams"]:
        print("{} {}".format(k,rcParamsDict["rcParams"][k]))
        plt.rcParams[k] = rcParamsDict["rcParams"][k]
    for k1 in set(list(rcParamsDict)).difference(set(["rcParams"])):
        print("{} {}".format(k1,rcParamsDict[k1]))
scanpy==1.9.3 anndata==0.8.0 umap==0.5.1 numpy==1.20.3 scipy==1.10.1 pandas==2.0.3 scikit-learn==1.3.2 statsmodels==0.13.5 python-igraph==0.9.1 louvain==0.7.0 leidenalg==0.8.3 pynndescent==0.5.12
figure.dpi 80
savefig.dpi 500
figure.figsize [10, 10]
axes.facecolor None
figure.facecolor None
dotSize 20

Configure paths¶

In [7]:
outdir = "../data/output"

with open("../data/resources/iPSC_lines_map.yaml", 'r') as f:
    iPSC_lines_map = yaml.load(f, Loader=yaml.FullLoader)["lines"]
colorsmap = dict(zip([i["newName"] for i in iPSC_lines_map.values()],[i["color"] for i in iPSC_lines_map.values()]))


figDir = "./figures"
if not os.path.exists(figDir):
   # Create a new directory because it does not exist
   os.makedirs(figDir)


figDir = "./figures"
if not os.path.exists(figDir):
   # Create a new directory because it does not exist
   os.makedirs(figDir)
    
#tradeSeqUtilsPath = "/opt/tradeSeq_utils.R"
tradeSeqUtilsPath = "/opt/tradeSeq_utils.R"

var setting¶

In [8]:
branchTag = "Interneurons"
endpoint = "endpoint_Interneurons"
markers = ["RELN","PAX6","CBLN1","MKI67"]
min_cells = 50
MultiTPTstages = ["early","mid","late"]
binWidthRate = .05
ResamplingIterations = 50
leidenOrder = ["ProliferatingProgenitors", 'RadialGliaProgenitors', 'Neurons','Interneurons']
In [9]:
BranchBCs = sc.read_h5ad(outdir+"/adatas/adataPaga.h5ad")
sc.pl.draw_graph(BranchBCs[BranchBCs.obs[endpoint] == 1], color = ["cellID_newName","leidenAnnotated","dataset"], size = rcParamsDict["dotSize"],add_outline=True, outline_width=(0.2, 0.05), vmin='p2',vmax='p98', wspace=.4, ncols=2)
BranchBCs = BranchBCs.obs_names[BranchBCs.obs[endpoint] == 1].tolist()

harmonyTheta = 1
HVGs=pd.read_csv(outdir+"/HVG_list_intersection_Curated_"+branchTag+"_PagaTest.txt", sep = "\t")["HVG"]

Branch specifi pre-processing¶

In [10]:
adataBranch = sc.read_h5ad(outdir+'/adatas/MultiplexingPreprocessing_unscaled.h5ad')[BranchBCs]
adataBranch.obs["leidenAnnotated"] =  sc.read_h5ad(outdir+"/adatas/ClusterAnnotated_Base_filt.h5ad")[adataBranch.obs_names].obs["leidenAnnotated"]
adataBranch.uns["leidenAnnotated_colors"] =  sc.read_h5ad(outdir+"/adatas/adataPaga.h5ad")[adataBranch.obs_names].uns["leidenAnnotated_colors"]
goodcls = adataBranch.obs.groupby(["leidenAnnotated"]).size()[adataBranch.obs.groupby(["leidenAnnotated"]).size() >= 50].index.tolist()
adataBranch = adataBranch[adataBranch.obs["leidenAnnotated"].isin(goodcls)]


adataBranch.layers["unscaled"] = adataBranch.X.copy()


sc.pp.normalize_total(adataBranch, target_sum=1e4)

sc.pp.log1p(adataBranch)
adataBranch.raw = adataBranch



adataBranch = adataBranch[:,HVGs]
adataBranch.var["highly_variable"] = True
#sc.pp.highly_variable_genes(adataBranch, min_mean=0.0125, max_mean=5, min_disp=0.5, batch_key = "dataset")

sc.pp.regress_out(adataBranch, ['total_counts','pct_counts_mt'], n_jobs=4)
sc.pp.scale(adataBranch, zero_center=False)
#adataBranch.X = adataBranch.X + np.abs(adataBranch.X.min())
normalizing counts per cell
    finished (0:00:00)
regressing out ['total_counts', 'pct_counts_mt']
    sparse input is densified and may lead to high memory use
    finished (0:02:22)
In [11]:
sc.tl.pca(adataBranch, svd_solver='arpack', use_highly_variable=True)
sce.pp.harmony_integrate(adataBranch, 'dataset', max_iter_harmony=20,adjusted_basis="X_pca",random_state = 0, theta = harmonyTheta, lamb = 2)
sc.pp.neighbors(adataBranch,use_rep="X_pca",n_neighbors=50, n_pcs=10, random_state=0)

sc.tl.umap(adataBranch, random_state=0)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:04)
2024-06-04 10:21:11,264 - harmonypy - INFO - Iteration 1 of 20
2024-06-04 10:21:14,575 - harmonypy - INFO - Iteration 2 of 20
2024-06-04 10:21:18,045 - harmonypy - INFO - Converged after 2 iterations
computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:11)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:31)
In [12]:
sc.pl.pca(adataBranch, color = ["dataset","stage",'leidenAnnotated'], size = rcParamsDict["dotSize"],add_outline=True, outline_width=(0.2, 0.05), vmin='p2',vmax='p98',wspace=.4)

DPT¶

In [13]:
sc.tl.diffmap(adataBranch)


RootName = adataBranch.obs_names[adataBranch[:, ["TOP2A"]].X.argmax()]
root_idx = np.where(adataBranch.obs_names == RootName)[0][0]
adataBranch.uns['iroot'] = root_idx


sc.tl.dpt(adataBranch)
sc.pl.diffmap(adataBranch, color = ["dpt_pseudotime","TOP2A","dataset"],size = rcParamsDict["dotSize"],add_outline=True, outline_width=(0.2, 0.05), vmin='p2',vmax='p98', legend_fontsize="xx-large", frameon=False, wspace=.3, legend_loc="on data")
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
    finished (0:00:00)
    eigenvalues of transition matrix
    [1.         0.9920156  0.97766703 0.97297156 0.95558953 0.9542264
     0.94159704 0.92929804 0.9245762  0.9054149  0.89950746 0.89710355
     0.89215136 0.88430285 0.8795954 ]
    finished: added
    'X_diffmap', diffmap coordinates (adata.obsm)
    'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:00)
computing Diffusion Pseudotime using n_dcs=10
    finished: added
    'dpt_pseudotime', the pseudotime (adata.obs) (0:00:00)
In [14]:
adataBranch.write_h5ad(outdir+"/adatas/AdataLineage."+branchTag+".h5ad")
In [15]:
timeCov="dpt_pseudotime"
from matplotlib.pyplot import figure
sc.settings.set_figure_params(dpi=100)
pylab.rcParams['figure.figsize'] = (10, 8)

By timepoint¶

In [16]:
sns.set_style("white")
In [17]:
cov = "stage"
In [18]:
#errors by dataset
for Stage in adataBranch.obs["stage"].unique(): 
    StageLT = adataBranch.obs[adataBranch.obs["stage"] == Stage]
    
    StageLT_smoothed = pd.DataFrame(index=np.linspace(0, 1, 10000))
    for dataset in StageLT["dataset"].unique():
        datasetLT = StageLT.loc[StageLT["dataset"] == dataset,timeCov]
        _bandwidth = round((datasetLT.max() - datasetLT.min()) * binWidthRate, 2)

        KernelData = datasetLT

        
        X = KernelData.values[:, np.newaxis]
        X_plot = np.linspace(0, 1, 10000)[:, np.newaxis]

        # Pick bandwidht
        kde_Calc = KernelDensity(kernel="gaussian", bandwidth=_bandwidth).fit(X)
        Y=np.exp(kde_Calc.score_samples(X_plot))
        
        StageLT_smoothed[dataset] = Y
        
    StageLT_smoothed["dpt_cut"] = StageLT_smoothed.index.tolist()
        
    # Calculate error and mean on smoothe values ()

    StageLT_smoothed["mean"] = StageLT_smoothed[[i for i in StageLT_smoothed.columns if i not in ['dpt_cut','stdev']]].mean(axis =1)
    StageLT_smoothed["stdev"] = StageLT_smoothed[[i for i in StageLT_smoothed.columns if i not in ['dpt_cut','stdev']]].std(axis =1)
    StageLT_smoothed["errMin"] = StageLT_smoothed["mean"] - StageLT_smoothed["stdev"]
    StageLT_smoothed["errMax"] = StageLT_smoothed["mean"] + StageLT_smoothed["stdev"]
    StageLT_smoothed["errMin"] = np.where(StageLT_smoothed["errMin"] < 0, 0, StageLT_smoothed["errMin"])
    
    plt.plot(StageLT_smoothed["dpt_cut"], StageLT_smoothed["mean"], '-', label =Stage,linewidth = 3,color = dict(zip(adataBranch.obs[cov].cat.categories.tolist(),adataBranch.uns["stage_colors"]))[Stage])
    plt.fill_between(StageLT_smoothed["dpt_cut"], StageLT_smoothed["errMin"], StageLT_smoothed["errMax"] , alpha=0.2,color = dict(zip(adataBranch.obs[cov].cat.categories.tolist(),adataBranch.uns["stage_colors"]))[Stage])
    plt.xlabel('pseudotime', size=30)
    plt.xticks(size=30)
    plt.yticks(size=30)

    plt.ylabel('Cells density', size=30)
    plt.legend(prop={'size': 30})
    plt.savefig(figDir+"/8_LineageFigures.dpt."+branchTag+".svg")
In [19]:
fig, ax  = plt.subplots(1,1, figsize=(20,10))

adataBranchCTL02 = adataBranch[adataBranch.obs["cellID_newName"] == "CTL02A"]
for Stage in adataBranchCTL02.obs["stage"].unique(): 
    StageLT = adataBranchCTL02.obs[adataBranchCTL02.obs["stage"] == Stage]
    
    StageLT_smoothed = pd.DataFrame(index=np.linspace(0, 1, 10000))
    for dataset in StageLT["dataset"].unique():
        datasetLT = StageLT.loc[StageLT["dataset"] == dataset,timeCov]
        _bandwidth = round((datasetLT.max() - datasetLT.min()) * binWidthRate, 2)
        KernelData = datasetLT
        
        
        X = KernelData.values[:, np.newaxis]
        X_plot = np.linspace(0, 1, 10000)[:, np.newaxis]

        # Pick bandwidht
        kde_Calc = KernelDensity(kernel="gaussian", bandwidth=_bandwidth).fit(X)
        Y=np.exp(kde_Calc.score_samples(X_plot))
        
        StageLT_smoothed[dataset] = Y
        
    StageLT_smoothed["dpt_cut"] = StageLT_smoothed.index.tolist()
        
    # Calculate error and mean on smoothe values ()

    StageLT_smoothed["mean"] = StageLT_smoothed[[i for i in StageLT_smoothed.columns if i not in ['dpt_cut','stdev']]].mean(axis =1)
    StageLT_smoothed["stdev"] = StageLT_smoothed[[i for i in StageLT_smoothed.columns if i not in ['dpt_cut','stdev']]].std(axis =1)
    StageLT_smoothed["errMin"] = StageLT_smoothed["mean"] - StageLT_smoothed["stdev"]
    StageLT_smoothed["errMax"] = StageLT_smoothed["mean"] + StageLT_smoothed["stdev"]
    StageLT_smoothed["errMin"] = np.where(StageLT_smoothed["errMin"] < 0, 0, StageLT_smoothed["errMin"])
    
    plt.plot(StageLT_smoothed["dpt_cut"], StageLT_smoothed["mean"], '-', label =Stage,linewidth = 3,color = dict(zip(adataBranchCTL02.obs[cov].cat.categories.tolist(),adataBranchCTL02.uns["stage_colors"]))[Stage])
    plt.fill_between(StageLT_smoothed["dpt_cut"], StageLT_smoothed["errMin"], StageLT_smoothed["errMax"] , alpha=0.2,color = dict(zip(adataBranchCTL02.obs[cov].cat.categories.tolist(),adataBranchCTL02.uns["stage_colors"]))[Stage])
    plt.xlabel('pseudotime', size=30)
    plt.xticks(size=30)
    plt.yticks(size=30)

    plt.ylabel('Cells density', size=30)
    plt.legend(prop={'size': 30})
    plt.savefig(figDir+"/8_LineageFigures.dpt.CTL02A."+branchTag+".svg")

By paradigm¶

In [19]:
cov = "type"
In [20]:
#with errors by dataset
for Stage in adataBranch.obs["type"].unique(): 
    StageLT = adataBranch.obs[adataBranch.obs["type"] == Stage]
    
    StageLT_smoothed = pd.DataFrame(index=np.linspace(0, 1, 10000))
    for dataset in StageLT["dataset"].unique():
        datasetLT = StageLT.loc[StageLT["dataset"] == dataset,timeCov]
        _bandwidth = round((datasetLT.max() - datasetLT.min()) * binWidthRate, 2)

        KernelData = datasetLT

        
        X = KernelData.values[:, np.newaxis]
        X_plot = np.linspace(0, 1, 10000)[:, np.newaxis]

        # Pick bandwidht
        kde_Calc = KernelDensity(kernel="gaussian", bandwidth=_bandwidth).fit(X)
        Y=np.exp(kde_Calc.score_samples(X_plot))
        
        StageLT_smoothed[dataset] = Y
        
    StageLT_smoothed["dpt_cut"] = StageLT_smoothed.index.tolist()
        
    # Calculate error and mean on smoothe values ()

    StageLT_smoothed["mean"] = StageLT_smoothed[[i for i in StageLT_smoothed.columns if i not in ['dpt_cut','stdev']]].mean(axis =1)
    StageLT_smoothed["stdev"] = StageLT_smoothed[[i for i in StageLT_smoothed.columns if i not in ['dpt_cut','stdev']]].std(axis =1)
    StageLT_smoothed["errMin"] = StageLT_smoothed["mean"] - StageLT_smoothed["stdev"]
    StageLT_smoothed["errMax"] = StageLT_smoothed["mean"] + StageLT_smoothed["stdev"]
    StageLT_smoothed["errMin"] = np.where(StageLT_smoothed["errMin"] < 0, 0, StageLT_smoothed["errMin"])
    
    plt.plot(StageLT_smoothed["dpt_cut"], StageLT_smoothed["mean"], '-', label =Stage,linewidth = 3,color = dict(zip(adataBranch.obs[cov].cat.categories.tolist(),adataBranch.uns["type_colors"]))[Stage])
    plt.fill_between(StageLT_smoothed["dpt_cut"], StageLT_smoothed["errMin"], StageLT_smoothed["errMax"] , alpha=0.2,color = dict(zip(adataBranch.obs[cov].cat.categories.tolist(),adataBranch.uns["type_colors"]))[Stage])

    plt.xlabel('pseudotime', size=30)
    plt.xticks(size=30)
    plt.yticks(size=30)

    plt.ylabel('Cells density', size=30)
    plt.legend(prop={'size': 30})
    plt.savefig(figDir+"/8_LineageFigures.Up_vs_Down.dpt."+branchTag+".svg")

Explore Genotypes¶

In [21]:
#######---original adata ------- Load Original adata and plot uncorrected PCA and original compositions

sc.settings.verbosity = 0 


adataInitPCA = adataBranch.copy()


compositions = pd.DataFrame(adataInitPCA.obs.groupby(["stage","cellID_newName"]).size())
compositions = compositions.reset_index().rename(columns={0:"number_of_cells"})
fig = px.bar(compositions, x="stage", y="number_of_cells", color="cellID_newName", 
             category_orders={"stage":["early","mid","late"]}, height=800,width=1000, template="plotly_white",
             color_discrete_map = dict(zip(adataInitPCA.obs["cellID_newName"].cat.categories, adataInitPCA.uns["cellID_newName_colors"])))

fig.update_traces(marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=1)

fig.show()

Downsampling loops¶

In [22]:
ReproducibilityDF = pd.DataFrame() 
ReproducibilityRaw = pd.DataFrame() 


adataInitPCA = adataInitPCA[adataInitPCA.obs["stage"].isin(MultiTPTstages)]
#Remove genotypes not in all timepoints
GoodIDs = pd.crosstab(adataInitPCA.obs.cellID_newName,adataInitPCA.obs.stage ) > min_cells
GoodIDs = GoodIDs[GoodIDs.sum(axis =1) == len(adataInitPCA.obs["stage"].unique())].index.tolist()
adataInitPCA = adataInitPCA[adataInitPCA.obs.cellID_newName.isin(GoodIDs)]
    
compositions = pd.DataFrame(adataInitPCA.obs.groupby(["stage","cellID_newName"]).size())
compositions = compositions.reset_index().rename(columns={0:"number_of_cells"})
fig = px.bar(compositions, x="stage", y="number_of_cells", color="cellID_newName", title="Balance Before each loop",
             category_orders={"stage":["early","mid","late"]}, height=800,width=1000, template="plotly_white",
             color_discrete_map = dict(zip(adataInitPCA.obs["cellID_newName"].cat.categories, adataInitPCA.uns["cellID_newName_colors"])))

fig.update_traces(marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=1)

fig.show()

    
for seed in range(ResamplingIterations):
    random.seed(seed)
    
    adataGenotypes = adataInitPCA.copy()


    #Subset timepoint if needed
    MinCells = pd.crosstab(adataGenotypes.obs.cellID_newName,adataGenotypes.obs.stage ) 
    MinCells = MinCells.min().to_dict()

    sampledBCs = []
    for stage in adataGenotypes.obs.stage.unique():
        samplingSize = MinCells[stage]
        sampledBCsStage = [ random.sample(adataGenotypes[(adataGenotypes.obs["stage"] == stage) & (adataGenotypes.obs["cellID_newName"] == ID)].obs_names.tolist() ,samplingSize) for ID in adataGenotypes.obs["cellID_newName"].unique().tolist()  ]
        sampledBCsStage = [item for sublist in sampledBCsStage for item in sublist]
        sampledBCs = sampledBCs + sampledBCsStage

    adataGenotypes = adataGenotypes[sampledBCs]
    
    #Trim extreme values
#     trimExtremes = [0.01,.99]
#     minDPT, maxDPT = np.quantile(adataGenotypes.obs["pc1"], trimExtremes)
#     adataGenotypes = adataGenotypes[(adataGenotypes.obs["pc1"] > minDPT) & (adataGenotypes.obs["pc1"] < maxDPT)]
    
    _bandwidth = round((adataGenotypes.obs["dpt_pseudotime"].max() - adataGenotypes.obs["dpt_pseudotime"].min()) * binWidthRate, 2)
    seedPseudotime = pd.DataFrame()
    seedPseudotimeRaw = pd.DataFrame()

    for cellID in adataGenotypes.obs["cellID_newName"].unique(): 
        LocalPseudotime = pd.DataFrame()
        LocalPseudotimeRaw = pd.DataFrame()
        KernelData = adataGenotypes.obs.loc[adataBranch.obs["cellID_newName"] == cellID, "dpt_pseudotime"]
        X = KernelData.values[:, np.newaxis]
        X_plot = np.linspace(adataGenotypes.obs["dpt_pseudotime"].min(), adataGenotypes.obs["dpt_pseudotime"].max(), 10000)[:, np.newaxis]
        # Pick bandwidht
        kde_Calc = KernelDensity(kernel="gaussian", bandwidth=_bandwidth).fit(X)
        Y=np.exp(kde_Calc.score_samples(X_plot))
        LocalPseudotime[seed] = Y
        LocalPseudotime["cellID"] = cellID
        LocalPseudotime["x"] = X_plot.flatten()
        seedPseudotime = pd.concat([seedPseudotime,LocalPseudotime ], ignore_index=True, axis = 0)
        #Raw values non smoothed
        LocalPseudotimeRaw[seed] = KernelData.tolist()
        LocalPseudotimeRaw["cellID"] = cellID
        seedPseudotimeRaw = pd.concat([seedPseudotimeRaw,LocalPseudotimeRaw], ignore_index=True)
    if "cellID" in ReproducibilityDF.columns:
        ReproducibilityDF = pd.concat([seedPseudotime[seed], ReproducibilityDF], axis = 1)
        ReproducibilityRaw = pd.concat([seedPseudotimeRaw[seed], ReproducibilityRaw], axis = 1)
    else:
        ReproducibilityDF = pd.concat([ReproducibilityDF, seedPseudotime], axis =1 )
        ReproducibilityRaw = pd.concat([ReproducibilityRaw, seedPseudotimeRaw], axis =1 )
 
        
        
compositions = pd.DataFrame(adataGenotypes.obs.groupby(["stage","cellID_newName"]).size())
compositions = compositions.reset_index().rename(columns={0:"number_of_cells"})
fig = px.bar(compositions, x="stage", y="number_of_cells", color="cellID_newName", title="Balance Operated whithin each loop",
             category_orders={"stage":["early","mid","late"]}, height=800,width=1000, template="plotly_white",
             color_discrete_map = dict(zip(adataGenotypes.obs["cellID_newName"].cat.categories, adataGenotypes.uns["cellID_newName_colors"])))

fig.update_traces(marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=1)

fig.show()
In [23]:
plotDF = ReproducibilityDF.copy()

plotDF["std"] = plotDF[[c for c in plotDF.columns if c not in ["x","cellID","std","mean",'max','min']]].std(axis =1)

plotDF["mean"] = plotDF[[c for c in plotDF.columns if c not in ["x","cellID","std","mean",'max','min']]].mean(axis =1)
plotDF["max"] = plotDF["mean"]+plotDF["std"]
plotDF["min"] = plotDF["mean"]-plotDF["std"]

for g in plotDF.cellID.unique():
    plotDFID = plotDF[plotDF.cellID == g]
    plt.plot(plotDFID["x"], plotDFID["mean"], '-',
             label =g,
             color = dict(zip(adataGenotypes.obs["cellID_newName"].cat.categories.tolist(),adataGenotypes.uns["cellID_newName_colors"]))[g],
             linewidth = 3)
    plt.fill_between(plotDFID["x"], plotDFID["min"], plotDFID["max"] , 
                     alpha=0.2,label=g,
                     color = dict(zip(adataGenotypes.obs["cellID_newName"].cat.categories.tolist(),adataGenotypes.uns["cellID_newName_colors"]))[g])

    plt.xlabel('pseudotime', size=30)
    plt.xticks(size=30)
    plt.yticks(size=30)
    plt.legend(prop={'size': 30})
    plt.savefig(figDir+"/8_LineageFigures.Genotypes."+branchTag+".svg")
In [24]:
DPTdf = plotDF.rename(columns=dict(zip([c for c in plotDF.columns if isinstance(c, int)], ["iteration."+str(c) for c in plotDF.columns if isinstance(c, int)]))).rename(columns={"x":"pseudotime_bin"})
DPTdf["Branch"] = branchTag
DPTdf.to_csv(outdir+"/PseudotimeGenotypeDensity."+branchTag+".tsv", index=False, sep="\t")

Tradeseq within lineage¶

In [24]:
plt.rcParams.update(plt.rcParamsDefault)
with open("../data/resources/rcParams.yaml") as f:
    rcParamsDict = yaml.full_load(f)
    for k in rcParamsDict["rcParams"]:
        print("{} {}".format(k,rcParamsDict["rcParams"][k]))
        plt.rcParams[k] = rcParamsDict["rcParams"][k]
    for k1 in set(list(rcParamsDict)).difference(set(["rcParams"])):
        print("{} {}".format(k1,rcParamsDict[k1]))
figure.dpi 80
savefig.dpi 500
figure.figsize [10, 10]
axes.facecolor None
figure.facecolor None
dotSize 20
In [25]:
obs = adataBranch.obs
obsm = adataBranch.obsm


adataTradeseq = sc.read_h5ad(outdir+'/adatas/MultiplexingPreprocessing_unscaled.h5ad')[obs.index]
#Sample extra genes to ensure fair unbyased normalization
ExtraNormGenes = random.sample(adataTradeseq.var_names[~adataTradeseq.var_names.isin(HVGs)].tolist(), len(HVGs.tolist()))
adataTradeseq = adataTradeseq[:,HVGs.tolist()+ExtraNormGenes]

adataTradeseq.obs = obs.loc[adataTradeseq.obs_names]
adataTradeseq = adataTradeseq[:,~adataTradeseq.var_names.str.startswith('MT-')]
adataTradeseq = adataTradeseq[:,~adataTradeseq.var_names.str.startswith(("RPS","RPL"))]
adataTradeseq.obsm = obsm
adataTradeseq.uns["leidenAnnotated_colors"] =  adataBranch[adataTradeseq.obs_names].uns["leidenAnnotated_colors"]


#Trim extreme values of DPT (for potentially low amount of cells)
trimExtremes = [0.01,.99]
minDPT, maxDPT = np.quantile(adataTradeseq.obs["dpt_pseudotime"], trimExtremes)
adataTradeseq = adataTradeseq[(adataTradeseq.obs["dpt_pseudotime"] > minDPT) & (adataTradeseq.obs["dpt_pseudotime"] < maxDPT)]
In [26]:
sc.pl.diffmap(adataTradeseq, color = ["dpt_pseudotime","leidenAnnotated"],
           size = rcParamsDict["dotSize"],add_outline=True, outline_width=(0.2, 0.05), vmin='p2',vmax='p98', frameon=False, wspace=.3, legend_loc="on data")
In [27]:
adataTradeseq.obs[endpoint] = 1
In [28]:
cellWeights = adataTradeseq.obs[endpoint]
cellWeights = np.matrix(cellWeights.to_numpy()).T


pseudotime = adataTradeseq.obs["dpt_pseudotime"]
pseudotime = np.matrix(pseudotime.to_numpy()).T



# Transpose needed!
counts = adataTradeseq.X.T.todense()
counts = counts + abs(counts.min())

varnames = adataTradeseq.var_names.tolist()
In [29]:
import anndata2ri
import rpy2
anndata2ri.activate()
%load_ext rpy2.ipython
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
In [30]:
%%R -i tradeSeqUtilsPath

source(tradeSeqUtilsPath)
In [31]:
%%R -i cellWeights -i pseudotime -i counts -i varnames
set.seed(8)

library(tradeSeq)
library(ggplot2)


w <- as.matrix(cellWeights)
dpt <- as.matrix(pseudotime)
cMatrix <- as.matrix(counts)


gamObj <- fitGAM(cMatrix, verbose = TRUE, pseudotime = dpt, cellWeights = w, nknots = 8, sce=FALSE)
names(gamObj) <- varnames
  |                                                  | 0 % ~calculating   |+                                                 | 1 % ~08m 32s       |+                                                 | 2 % ~07m 46s       |++                                                | 3 % ~07m 40s       |++                                                | 4 % ~07m 20s       |+++                                               | 5 % ~07m 13s       |+++                                               | 6 % ~07m 25s       |++++                                              | 7 % ~07m 18s       |++++                                              | 8 % ~07m 14s       |+++++                                             | 9 % ~07m 05s       |+++++                                             | 10% ~06m 58s       |++++++                                            | 11% ~06m 53s       |++++++                                            | 12% ~06m 47s       |+++++++                                           | 13% ~06m 45s       |+++++++                                           | 14% ~06m 43s       |++++++++                                          | 15% ~06m 41s       |++++++++                                          | 16% ~06m 34s       |+++++++++                                         | 17% ~06m 29s       |+++++++++                                         | 18% ~06m 27s       |++++++++++                                        | 19% ~06m 20s       |++++++++++                                        | 20% ~06m 17s       |+++++++++++                                       | 21% ~06m 12s       |+++++++++++                                       | 22% ~06m 10s       |++++++++++++                                      | 23% ~06m 05s       |++++++++++++                                      | 24% ~05m 58s       |+++++++++++++                                     | 25% ~05m 52s       |+++++++++++++                                     | 26% ~05m 47s       |++++++++++++++                                    | 27% ~05m 42s       |++++++++++++++                                    | 28% ~05m 37s       |+++++++++++++++                                   | 29% ~05m 31s       |+++++++++++++++                                   | 30% ~05m 26s       |++++++++++++++++                                  | 31% ~05m 22s       |++++++++++++++++                                  | 32% ~05m 17s       |+++++++++++++++++                                 | 33% ~05m 12s       |+++++++++++++++++                                 | 34% ~05m 06s       |++++++++++++++++++                                | 35% ~05m 02s       |++++++++++++++++++                                | 36% ~04m 57s       |+++++++++++++++++++                               | 37% ~04m 53s       |+++++++++++++++++++                               | 38% ~04m 49s       |++++++++++++++++++++                              | 39% ~04m 46s       |++++++++++++++++++++                              | 40% ~04m 41s       |+++++++++++++++++++++                             | 41% ~04m 36s       |+++++++++++++++++++++                             | 42% ~04m 31s       |++++++++++++++++++++++                            | 43% ~04m 26s       |++++++++++++++++++++++                            | 44% ~04m 20s       |+++++++++++++++++++++++                           | 45% ~04m 15s       |+++++++++++++++++++++++                           | 46% ~04m 10s       |++++++++++++++++++++++++                          | 47% ~04m 05s       |++++++++++++++++++++++++                          | 48% ~04m 01s       |+++++++++++++++++++++++++                         | 49% ~03m 56s       |+++++++++++++++++++++++++                         | 50% ~04m 33s       |++++++++++++++++++++++++++                        | 51% ~04m 38s       |++++++++++++++++++++++++++                        | 52% ~04m 53s       |+++++++++++++++++++++++++++                       | 53% ~05m 01s       |+++++++++++++++++++++++++++                       | 54% ~05m 02s       |++++++++++++++++++++++++++++                      | 55% ~05m 05s       |++++++++++++++++++++++++++++                      | 56% ~05m 07s       |+++++++++++++++++++++++++++++                     | 57% ~05m 11s       |+++++++++++++++++++++++++++++                     | 58% ~05m 10s       |++++++++++++++++++++++++++++++                    | 59% ~05m 15s       |++++++++++++++++++++++++++++++                    | 60% ~05m 14s       |+++++++++++++++++++++++++++++++                   | 61% ~05m 24s       |+++++++++++++++++++++++++++++++                   | 62% ~05m 20s       |++++++++++++++++++++++++++++++++                  | 63% ~05m 24s       |++++++++++++++++++++++++++++++++                  | 64% ~05m 22s       |+++++++++++++++++++++++++++++++++                 | 65% ~05m 28s       |+++++++++++++++++++++++++++++++++                 | 66% ~05m 21s       |++++++++++++++++++++++++++++++++++                | 67% ~05m 24s       |++++++++++++++++++++++++++++++++++                | 68% ~05m 15s       |+++++++++++++++++++++++++++++++++++               | 69% ~05m 09s       |+++++++++++++++++++++++++++++++++++               | 70% ~05m 05s       |++++++++++++++++++++++++++++++++++++              | 71% ~05m 03s       |++++++++++++++++++++++++++++++++++++              | 72% ~04m 57s       |+++++++++++++++++++++++++++++++++++++             | 73% ~04m 56s       |+++++++++++++++++++++++++++++++++++++             | 74% ~04m 47s       |++++++++++++++++++++++++++++++++++++++            | 75% ~04m 41s       |++++++++++++++++++++++++++++++++++++++            | 76% ~04m 41s       |+++++++++++++++++++++++++++++++++++++++           | 77% ~04m 30s       |+++++++++++++++++++++++++++++++++++++++           | 78% ~04m 20s       |++++++++++++++++++++++++++++++++++++++++          | 79% ~04m 10s       |++++++++++++++++++++++++++++++++++++++++          | 80% ~04m 04s       |+++++++++++++++++++++++++++++++++++++++++         | 81% ~03m 53s       |+++++++++++++++++++++++++++++++++++++++++         | 82% ~03m 41s       |++++++++++++++++++++++++++++++++++++++++++        | 83% ~03m 30s       |++++++++++++++++++++++++++++++++++++++++++        | 84% ~03m 20s       |+++++++++++++++++++++++++++++++++++++++++++       | 85% ~03m 10s       |+++++++++++++++++++++++++++++++++++++++++++       | 86% ~02m 60s       |++++++++++++++++++++++++++++++++++++++++++++      | 87% ~02m 47s       |++++++++++++++++++++++++++++++++++++++++++++      | 88% ~02m 35s       |+++++++++++++++++++++++++++++++++++++++++++++     | 89% ~02m 27s       |+++++++++++++++++++++++++++++++++++++++++++++     | 90% ~02m 14s       |++++++++++++++++++++++++++++++++++++++++++++++    | 91% ~02m 01s       |++++++++++++++++++++++++++++++++++++++++++++++    | 92% ~01m 49s       |+++++++++++++++++++++++++++++++++++++++++++++++   | 93% ~01m 36s       |+++++++++++++++++++++++++++++++++++++++++++++++   | 94% ~01m 23s       |++++++++++++++++++++++++++++++++++++++++++++++++  | 95% ~01m 09s       |++++++++++++++++++++++++++++++++++++++++++++++++  | 96% ~55s           |+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~41s           |+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~28s           |++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~14s           |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=23m 01s
In [37]:
%%R -i tradeSeqUtilsPath

source(tradeSeqUtilsPath)
In [38]:
dptseq = np.linspace(pseudotime.min(),pseudotime.max(), 5)
nGenes = 3

leidenCols = dict(zip(adataTradeseq.obs.leidenAnnotated.cat.categories, adataTradeseq.uns["leidenAnnotated_colors"]))
In [39]:
%%R  -o startRes -i minDPT -i maxDPT


startRes <- startVsEndTest(gamObj, pseudotimeValues = c(minDPT+.01,maxDPT-.01))
In [40]:
# Top Up and Down
startResFilt = startRes[(startRes.pvalue <= 0.001) & (abs(startRes.logFClineage1) >= 2)]
startResFilt["Gene"] = startResFilt.index.tolist()
startResFilt["test"] = "startVsEndTest"
startResGenesPositive = startResFilt[startResFilt.logFClineage1 < 0 ].sort_values(["pvalue","logFClineage1"], ascending=[True,True])
startResGenesPositive_plot = startResGenesPositive.head(nGenes).index.tolist()
startResGenesPositive["PatternType"] = "decreasing"
startResGenesPositive = startResGenesPositive.rename(columns={"logFClineage1":"logFC"})

startResGenesNegatives = startResFilt[startResFilt.logFClineage1 > 0 ].sort_values(["pvalue","logFClineage1"], ascending=[True,False])
startResGenesNegatives_plot = startResGenesNegatives.head(nGenes).index.tolist()
startResGenesNegatives["PatternType"] = "increasing"
startResGenesNegatives = startResGenesNegatives.rename(columns={"logFClineage1":"logFC"})
In [41]:
CombinedDF = pd.concat([startResGenesPositive,startResGenesNegatives], ignore_index=True)
CombinedDF["Branch"] = branchTag
CombinedDF.to_csv(outdir+"/Tradeseq."+branchTag+".tsv", sep="\t", index=False)
In [42]:
%%R -i startResGenesPositive_plot -i startResGenesNegatives_plot -o assoResSSPandas

CombinedList <- c(startResGenesPositive_plot,startResGenesNegatives_plot)

datalist <- list()
for (g in CombinedList){
    
    localModel <- gamObj[[g]]    
    data <- localModel$model    
    y <- data$y
    nCurves <- length(localModel$smooth)

    for (jj in seq_len(nCurves)) {
        df <- .getPredictRangeDf(localModel$model, jj, nPoints = 100)
        yhat <- predict(localModel, newdata = df, type = "response")

        Newframe <- data.frame("fittedCounts" = yhat)
        colnames(Newframe) <- c(paste0("fittedCounts.",g))

        datalist[[paste(g,jj)]] <- Newframe

        
        

    }

}

assoResSSPandas =  do.call(cbind, datalist)
In [43]:
#Curatedmarkerslist = CuratedmarkersDF.loc[CuratedmarkersDF.score > 1,"name"].tolist()
#CuratedmarkersDF = pd.read_csv(markers, header=None, sep = "\t", names=["name","area","score"])
#Curatedmarkerslist = CuratedmarkersDF.loc[CuratedmarkersDF.score >= 1,"name"].tolist()

##############################


###################à
plt.rcdefaults()

sns.set_style("ticks", {'axes.grid' : False})


fig, axes = plt.subplots(2,1, figsize=(12,5),sharex=True, 
                         gridspec_kw={'height_ratios': [1, 3]})

sns.boxplot(data=adataTradeseq.obs, x="dpt_pseudotime", y="leidenAnnotated", ax = axes[0],
            palette=leidenCols,
            showfliers=False,order=leidenOrder)

    

    
    


#fig, axes = plt.subplots(2,1, figsize=(10,5),sharex=True, 
#                         gridspec_kw={'height_ratios': [1, 2]})

#sns.histplot(data=adataTradeseq.obs, x="dpt_pseudotime", hue="leidenAnnotated", ax=axes[0])

# sns.histplot(data=adataTradeseq.obs, x="dpt_pseudotime", hue="leidenAnnotated", ax=axes[0],bins=100,
#               palette=dict(zip(adata.obs.leidenAnnotated.cat.categories, adata.uns["leidenAnnotated_colors"])), alpha=.5)




SmoothedExpr = assoResSSPandas.copy()
SmoothedExpr["pseudotime"] = SmoothedExpr.index.tolist()      
SmoothedExpr["pseudotime"] =  np.linspace(pseudotime.min(),pseudotime.max(), 100)
SmoothedExpr = SmoothedExpr.melt("pseudotime")
SmoothedExpr["value"] = np.log(SmoothedExpr["value"])


#TrimExtremes


sns.lineplot(data=SmoothedExpr, x="pseudotime", y="value", hue= "variable",  linewidth=3, ax=axes[1])
#axes[1].lines[1].set_linestyle("--")





dptAxis = np.linspace(SmoothedExpr.pseudotime.min(), SmoothedExpr.pseudotime.max(), 10).round(2)      
ylimits = np.array([SmoothedExpr.value.min()-abs((SmoothedExpr.value.min()*.1)), 
                    SmoothedExpr.value.max()+abs((SmoothedExpr.value.max()*.1))]).round(2)
exprAxis = np.linspace(ylimits.min(), ylimits.max(),5).round(2)


axes[1].set_xlim([dptAxis.min(), dptAxis.max()])
axes[1].set_ylim(ylimits)


axes[0].yaxis.tick_right()
axes[0].spines[['left', 'top','bottom']].set_visible(False)
axes[0].spines['right'].set_linewidth(1.5)
axes[0].tick_params(axis='y', which='major', labelsize=15, width=1.5, length=10)
axes[0].tick_params(axis='x', which='major', labelsize=15, width=0, length=10)


axes[1].set_xticks(dptAxis)
axes[1].set_yticks(exprAxis)

axes[1].spines['left'].set_linewidth(1.5)
axes[1].spines['bottom'].set_linewidth(1.5)
axes[1].tick_params(axis='both', which='major', labelsize=15, width=1.5, length=10)
axes[1].spines[['right', 'top']].set_visible(False)
axes[1].xaxis.tick_bottom()
axes[1].yaxis.tick_left()
axes[1].legend(title='Gene', bbox_to_anchor=(1, .7))


sns.despine(ax=axes[1], top=True, right=True, left=False, bottom=False, offset=10, trim=False)
sns.despine(ax=axes[0], top=True, right=False, left=False, bottom=True, offset=10, trim=False)
axes[0].yaxis.set_label_position("right")
axes[0].spines[['left', 'top','bottom']].set_visible(False)

axes[1].set(xlabel='dpt pseudotime', ylabel='Log smoothed expression')



fig.show()
        


fig.savefig(figDir+'/TopGenes.'+branchTag+'.pdf',bbox_inches='tight')
In [ ]:

In [ ]: